Measuring importance of coefficients of OLS

$s^2 = \frac{y^TMy}{n -p} = \frac{y^T(y - X\hat{\beta})}{n -p}$

$s.e.(\hat{\beta_{j}}) = \sqrt{s^2(X^TX)^{-1}_{jj}}$

$t = \frac{\hat{\beta}}{s.e.(\hat{\beta})}$

$p = SF(|t|, n-p) * 2$

$c.i. = PPF((1 + confidence)/2, n-p)$


In [1]:
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy import stats
pd.options.display.float_format = '{:,.4f}'.format

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
N = 20000
X = sm.add_constant(np.random.randn(N,3)*10)
w = np.array([0.25, 0.5, 0.3, -0.5])
y = np.dot(X,w) + np.random.randn(X.shape[0])
print X.shape, y.shape, w.shape
plt.plot(X[:, 1], y, "o", X[:, 2], y, "s", X[:, 3], y, "d")


(20000L, 4L) (20000L,) (4L,)
Out[2]:
[<matplotlib.lines.Line2D at 0x1a28ac88>,
 <matplotlib.lines.Line2D at 0x1a28ad68>,
 <matplotlib.lines.Line2D at 0x1a29e400>]

In [3]:
model = sm.OLS(y, X)
res = model.fit()
print res.summary2()


                  Results: Ordinary least squares
===================================================================
Model:              OLS              Adj. R-squared:     0.983     
Dependent Variable: y                AIC:                56860.7399
Date:               2016-02-22 00:58 BIC:                56892.3538
No. Observations:   20000            Log-Likelihood:     -28426.   
Df Model:           3                F-statistic:        3.900e+05 
Df Residuals:       19996            Prob (F-statistic): 0.00      
R-squared:          0.983            Scale:              1.0050    
---------------------------------------------------------------------
           Coef.    Std.Err.       t       P>|t|     [0.025    0.975]
---------------------------------------------------------------------
const      0.2313     0.0071     32.6241   0.0000    0.2174    0.2452
x1         0.4991     0.0007    703.1243   0.0000    0.4977    0.5005
x2         0.2994     0.0007    425.5916   0.0000    0.2980    0.3008
x3        -0.5010     0.0007   -711.3886   0.0000   -0.5024   -0.4996
-------------------------------------------------------------------
Omnibus:              10.248        Durbin-Watson:           1.997 
Prob(Omnibus):        0.006         Jarque-Bera (JB):        11.197
Skew:                 -0.011        Prob(JB):                0.004 
Kurtosis:             3.114         Condition No.:           10    
===================================================================


In [4]:
def R2(y, X, coeffs):
    y_hat = np.dot(X, coeffs)
    y_mean = np.mean(y)
    SST = np.sum((y-y_mean)**2)
    SSR = np.sum((y_hat - y_mean)**2)
    SSE = np.sum((y_hat - y)**2)
    #R_squared = SSR / SST
    R_squared = SSE / SST
    return 1- R_squared

In [5]:
R2(y, X, res.params)


Out[5]:
0.98319694000573365

In [6]:
def se_coeff(y, X, coeffs):
    # Reference: https://en.wikipedia.org/wiki/Ordinary_least_squares#Finite_sample_properties
    s2 = np.dot(y, y - np.dot(X, coeffs)) / (X.shape[0] - X.shape[1]) # Calculate S-squared
    XX = np.diag(np.linalg.inv(np.dot(X.T, X))) # Calculate 
    return np.sqrt(s2*XX)

In [7]:
coeffs = res.params
N, K = X.shape
se = se_coeff(y, X, coeffs)
t = coeffs / se
p = stats.t.sf(np.abs(t), N - K)*2
ci = stats.t.ppf((1 + 0.95)/2, N-K)*se
pd.DataFrame(np.vstack((coeffs, se, t, p, coeffs - ci, coeffs + ci)).T, columns=["coeff", "S.E.", "t", "p-value", "ci-", "ci+"])


Out[7]:
coeff S.E. t p-value ci- ci+
0 0.2313 0.0071 32.6241 0.0000 0.2174 0.2452
1 0.4991 0.0007 703.1243 0.0000 0.4977 0.5005
2 0.2994 0.0007 425.5916 0.0000 0.2980 0.3008
3 -0.5010 0.0007 -711.3886 0.0000 -0.5024 -0.4996

Coefficient Significant for Logistic Regression


In [8]:
def sigmoid(x):
    return 1. / (1. + np.exp(-x))

In [11]:
plt.clf()
y_sc = sigmoid(np.dot(X,w))
idx = np.argsort(y_sc)
plt.plot(y_sc[idx], "-b", label="logit")
y = stats.bernoulli.rvs(y_sc, size=y_sc.shape[0])
plt.plot(y[idx], "or", label="label")
plt.legend()


Out[11]:
<matplotlib.legend.Legend at 0x1a2004a8>

In [12]:
model = sm.Logit(y, X)
res = model.fit()
print res.summary2()
print w


Optimization terminated successfully.
         Current function value: 0.164233
         Iterations 9
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.763    
Dependent Variable: y                AIC:              6577.3345
Date:               2016-02-22 00:58 BIC:              6608.9484
No. Observations:   20000            Log-Likelihood:   -3284.7  
Df Model:           3                LL-Null:          -13861.  
Df Residuals:       19996            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     9.0000                                      
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
const    0.2180     0.0317     6.8755   0.0000    0.1558    0.2801
x1       0.5046     0.0095    53.3732   0.0000    0.4861    0.5232
x2       0.2980     0.0061    48.7087   0.0000    0.2860    0.3100
x3      -0.5028     0.0094   -53.3659   0.0000   -0.5213   -0.4844
================================================================

[ 0.25  0.5   0.3  -0.5 ]

In [13]:
plt.hist(y)
plt.hist(y_sc)


Out[13]:
(array([ 7590.,   816.,   530.,   468.,   408.,   449.,   460.,   573.,
          825.,  7881.]),
 array([  2.97715082e-13,   1.00000000e-01,   2.00000000e-01,
          3.00000000e-01,   4.00000000e-01,   5.00000000e-01,
          6.00000000e-01,   7.00000000e-01,   8.00000000e-01,
          9.00000000e-01,   1.00000000e+00]),
 <a list of 10 Patch objects>)

In [14]:
plt.plot(X[idx, 1], y_sc[idx], "o", X[idx, 2], y_sc[idx], "s", X[idx, 3], y_sc[idx], "d")


Out[14]:
[<matplotlib.lines.Line2D at 0x1d5ce748>,
 <matplotlib.lines.Line2D at 0x1d5ce828>,
 <matplotlib.lines.Line2D at 0x1d5cee48>]

In [15]:
def se_coeff_logit(y, X, coeffs):
    # Reference: https://en.wikipedia.org/wiki/Ordinary_least_squares#Finite_sample_properties
    s2 = np.dot(y, y - sigmoid(np.dot(X, coeffs))) / (X.shape[0] - X.shape[1]) # Calculate S-squared
    XX = np.diag(np.linalg.inv(np.dot(X.T, X))) # Calculate 
    return np.sqrt(s2*XX)

In [16]:
coeffs = res.params
N, K = X.shape
se = se_coeff_logit(y, X, coeffs)
t = coeffs / se
p = stats.t.sf(np.abs(t), N - K)*2
ci = stats.t.ppf((1 + 0.95)/2, N-K)*se
pd.DataFrame(np.vstack((coeffs, se, t, p, coeffs - ci, coeffs + ci)).T, columns=["coeff", "S.E.", "t", "p-value", "ci-", "ci+"])


Out[16]:
coeff S.E. t p-value ci- ci+
0 0.2180 0.0016 137.2274 0.0000 0.2148 0.2211
1 0.5046 0.0002 3,173.6295 0.0000 0.5043 0.5049
2 0.2980 0.0002 1,891.1343 0.0000 0.2977 0.2983
3 -0.5028 0.0002 -3,187.5882 0.0000 -0.5031 -0.5025

In [ ]: